NYC Taxi

library(arrow)
library(tidyverse)

TLC Trip Record Data

tlc <- read_parquet(file = "~/Documents/data/yellow_tripdata_2022-06.parquet")
str(tlc)
## tibble [3,558,124 × 19] (S3: tbl_df/tbl/data.frame)
##  $ VendorID             : int [1:3558124] 1 1 2 1 1 2 2 1 2 2 ...
##  $ tpep_pickup_datetime : POSIXct[1:3558124], format: "2022-05-31 20:25:41" "2022-05-31 20:44:40" ...
##  $ tpep_dropoff_datetime: POSIXct[1:3558124], format: "2022-05-31 20:48:22" "2022-05-31 21:01:48" ...
##  $ passenger_count      : num [1:3558124] 1 1 1 2 0 1 1 1 1 1 ...
##  $ trip_distance        : num [1:3558124] 11 4.2 9.49 12.1 1.8 2.02 8.08 4.3 8.78 1.76 ...
##  $ RatecodeID           : num [1:3558124] 1 1 1 1 1 1 1 1 1 1 ...
##  $ store_and_fwd_flag   : chr [1:3558124] "N" "N" "N" "N" ...
##  $ PULocationID         : int [1:3558124] 70 170 264 132 140 148 158 246 197 48 ...
##  $ DOLocationID         : int [1:3558124] 48 226 113 17 163 158 116 262 191 186 ...
##  $ payment_type         : int [1:3558124] 1 1 1 2 1 1 1 1 1 1 ...
##  $ fare_amount          : num [1:3558124] 32 14 26 37 9 9 26.5 15 26.5 7.5 ...
##  $ extra                : num [1:3558124] 3 3 0.5 1.75 3 0.5 0.5 3 0.5 0.5 ...
##  $ mta_tax              : num [1:3558124] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##  $ tip_amount           : num [1:3558124] 2 0 5 0 2.55 0.64 7.58 3.75 5.56 2.26 ...
##  $ tolls_amount         : num [1:3558124] 6.55 0 6.55 0 0 0 0 0 0 0 ...
##  $ improvement_surcharge: num [1:3558124] 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0.3 ...
##  $ total_amount         : num [1:3558124] 44.4 17.8 42.6 39.5 15.3 ...
##  $ congestion_surcharge : num [1:3558124] 2.5 2.5 2.5 0 2.5 2.5 2.5 2.5 0 2.5 ...
##  $ airport_fee          : num [1:3558124] 0 0 1.25 1.25 0 0 0 0 0 0 ...
library(lubridate)
tlc <- tlc %>% 
  mutate(
    date_pickup = date(tpep_pickup_datetime),
    date_dropoff = date(tpep_dropoff_datetime),
    hour_pickup = hours(tpep_pickup_datetime), 
    hour_dropoff = hours(tpep_dropoff_datetime),
    weekday_pickup = wday(tpep_pickup_datetime, label = TRUE),
    day_pickup = day(tpep_pickup_datetime)
    )
tlc %>% group_by(weekday_pickup) %>% 
  tally %>% 
  ggplot(aes(x = weekday_pickup, y = n)) + geom_bar(stat = "identity") 

tlc %>% group_by(weekday_pickup) %>% 
  summarise(mean_passanger = mean(passenger_count, na.rm = TRUE)) %>% 
  ggplot(aes(x = weekday_pickup, y = mean_passanger)) + geom_bar(stat = "identity") 

tlc %>% group_by(day_pickup) %>% 
  tally %>% 
  ggplot(aes(x = day_pickup, y = n)) + geom_line()

Geographic

library(sf)
library(ggmap)
tlc_zone <- st_read("~/Documents/taxi_zones/taxi_zones.shp", quiet = TRUE)

plot(tlc_zone)

tlc_zone <- st_transform(tlc_zone, crs = 4326)
 ggplot(tlc_zone) + geom_sf() + theme_inset()

our_neighborhood <- tlc_zone %>% 
  filter(zone == "Gramercy"|zone == "Kips Bay")

ggplot(tlc_zone) + geom_sf() + theme_inset() +
  geom_sf(data = our_neighborhood, fill = "red")

bbox <- st_bbox(tlc_zone) %>% as.numeric

nyc_map <- get_stamenmap(bbox = bbox, messaging = FALSE, zoom = 11, 
                         maptype = "toner-lite", format = c("png"))

ggmap(nyc_map) + 
  geom_sf(data = our_neighborhood, fill = "red", inherit.aes = FALSE)

#inherit.aes to use coordinates from data table, not nyc_map

ggmap(nyc_map) + geom_sf(data = joined_tbl, aes(fill = N), inherit.aes = FALSE) +
  scale_fill_viridis_c(option = "A")

Creating our own

random_locs <- st_sample(our_neighborhood, type = "random", size = 10)
ggplot() + geom_sf(data = our_neighborhood) + geom_sf(data = random_locs)

Looping Over Zones

tlc_zone_manhattan <- tlc_zone %>% filter(borough == "Manhattan")

bbox_manhattan <- st_bbox(tlc_zone_manhattan) %>% as.numeric

manhattan_map <- get_stamenmap(bbox = bbox_manhattan, messaging = FALSE, zoom = 11, 
                         maptype = "toner-lite", format = c("png"))


storage <- list()
map_output <- ggmap(manhattan_map)

for (zone_id in 1:nrow(joined_tbl)){
  zone <- joined_tbl[zone_id, ]
  zone$N[is.na(zone$N)] <- 0
  sampled_points <- zone %>% st_sample(type = "random",
                                       size = round(zone$N/100))
  storage[[zone_id]] <- sampled_points
  map_output <- map_output + geom_sf(data = storage[[zone_id]], 
                                         inherit.aes = FALSE,
                                         size = 0.1, alpha = 0.1)
}

# if you do not specify manhattan in tlc_zone, use xlim and ylim to specify
# which part of the map you want:
# map_output + xlim(-74.05, -73.9) + ylim(c(40.655, 40.855))

map_output

Now let’s use our tools on Airbnb data

airbnb <- read_csv("~/Documents/data/listings.csv")
airbnb_sf <- airbnb %>% 
  select(longitude, latitude, price) %>%
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326)

joined_sf <- st_join(tlc_zone, airbnb_sf)
summary_sf <- joined_sf %>% group_by(zone) %>%
              summarise(N = n(), mean_price = mean(price, na.rm = TRUE))

ggmap(nyc_map) +
  geom_sf(data = summary_sf, aes(fill = mean_price),
          inherit.aes = FALSE) +
  scale_fill_viridis_c(option = "B")

ggmap(nyc_map) +
  geom_sf(data = summary_sf, aes(fill = N),
          inherit.aes = FALSE) +
  scale_fill_viridis_c(option = "B")

NYC Tree data

nyc_tree <- read_csv("~/Documents/data/2015_Street_Tree_Census_-_Tree_Data.csv")

nyc_tree <- nyc_tree %>%
            select(longitude, latitude, tree_dbh) %>%
            st_as_sf(coords = c("longitude", "latitude"), crs = 4326)

# ggplot(nyc_tree) + geom_sf()

joined_sf <- st_join(tlc_zone, nyc_tree %>% sample_n(1000))
summary_sf <- joined_sf %>% group_by(zone) %>% 
              summarise(N = n(), mean_dbh = mean(tree_dbh, na.rm = TRUE))

ggmap(nyc_map) +
  geom_sf(data = summary_sf, aes(fill = mean_dbh),
          inherit.aes = FALSE) +
  scale_fill_viridis_c(option = "B")

ggmap(nyc_map) +
  geom_sf(data = summary_sf, aes(fill = N),
          inherit.aes = FALSE) +
  scale_fill_viridis_c(option = "B")